#!/usr/bin/env python
# encoding: utf-8
"""
@summary: Determine which region each log record with no referer came from.
@attention: Avoid emitting Chinese text in Hadoop output where possible.
@author: hongxingfan
@since: 2014-08-15 15:11:41
"""
import sys
import urllib2
import re

# Matches the value of the "cname" field in the IP service response body.
CNAME_PATTERN = re.compile(r"cname\":\"(.*?)\"")
IP_SERVICE_URL = "http://ip.sce.sohu.com/ipservice?ip="

# Zero-based positions of the fields this script reads from each
# tab-separated input line.
IP_COLUMN = 1
REFER_COLUMN = 8


def extract_region(content):
    """Return the ``cname`` value found in *content*, or None if absent."""
    match = CNAME_PATTERN.search(content)
    if match is None:
        return None
    return match.group(1)


def lookup_region(ip):
    """Ask the IP service for the region name of *ip*.

    Returns the extracted ``cname`` string, or None when the request fails
    or the response carries no ``cname`` field.  Failures are reported on
    stderr and never raised, so one bad lookup cannot abort the whole run.
    """
    try:
        handler = urllib2.urlopen(IP_SERVICE_URL + ip)
        try:
            content = handler.read()
        finally:
            # Always release the connection, even if read() fails.
            handler.close()
    except urllib2.URLError as e:
        # e.reason is not guaranteed to be a string (it can be a socket
        # error object), so format the exception rather than writing the
        # attribute directly.
        sys.stderr.write("ip lookup failed for %s: %s\n" % (ip, e))
        return None
    return extract_region(content)


def region_records(lines, resolve=lookup_region):
    """Yield one ``"<region>\\t1"`` record per input line whose refer is "-".

    lines   -- iterable of tab-separated log lines.
    resolve -- callable mapping an IP string to a region name, or None when
               the region is unknown; defaults to the network lookup.

    Lines with too few columns are skipped.  Successful lookups are cached
    per IP so each distinct address is resolved at most once; failed
    lookups are not cached and are retried on the next occurrence.
    """
    region_by_ip = {}
    for line in lines:
        # Strip only the line terminator: stripping all whitespace would
        # drop leading/trailing empty columns and shift the field indices.
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) <= REFER_COLUMN:
            continue
        if cols[REFER_COLUMN].strip() != "-":
            continue

        ip = cols[IP_COLUMN].strip()
        if ip in region_by_ip:
            region = region_by_ip[ip]
        else:
            region = resolve(ip)
            if region is None:
                continue
            region_by_ip[ip] = region
        yield "%s\t%d" % (region, 1)


if __name__ == "__main__":
    for record in region_records(sys.stdin):
        print(record)
